packages for data cleaning and visualisation

# Install the packages this analysis needs, skipping any that are already
# available. "dplyr" replaces the misspelled "dyplr", and the "ggplot" entry
# is dropped because ggplot() is provided by the ggplot2 package.
required_packages <- c("tidyverse", "dplyr", "ggplot2")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

Load the data file into a short variable name (`data`) for easy reference.

# Load the Titanic CSV from the working directory into `data`.
data <- read.csv(file = "titanic_vis_clean.csv")

get a global understanding of data structure and summary statistics

# Preview the first rows of the data frame.
head(data)
##   passengerid survived pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  name    sex age sibsp parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  28     0     0
##             ticket    fare embarked data.sex as.factor.sex. as.factor.survived.
## 1        A/5 21171  7.2500        S     male           male                   0
## 2         PC 17599 71.2833        C   female         female                   1
## 3 STON/O2. 3101282  7.9250        S   female         female                   1
## 4           113803 53.1000        S   female         female                   1
## 5           373450  8.0500        S     male           male                   0
## 6           330877  8.4583        Q     male           male                   0
##   as.factor.embarked.
## 1                   S
## 2                   C
## 3                   S
## 4                   S
## 5                   S
## 6                   Q
# Per-column summary statistics.
summary(data)
##   passengerid     survived          pclass          name          
##  Min.   :  1   Min.   :0.0000   Min.   :1.000   Length:889        
##  1st Qu.:224   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446   Mean   :0.3825   Mean   :2.312                     
##  3rd Qu.:668   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891   Max.   :1.0000   Max.   :3.000                     
##      sex                 age            sibsp            parch       
##  Length:889         Min.   : 0.42   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:22.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.0000  
##                     Mean   :29.32   Mean   :0.5242   Mean   :0.3825  
##                     3rd Qu.:35.00   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.0000   Max.   :6.0000  
##     ticket               fare           embarked           data.sex        
##  Length:889         Min.   :  0.000   Length:889         Length:889        
##  Class :character   1st Qu.:  7.896   Class :character   Class :character  
##  Mode  :character   Median : 14.454   Mode  :character   Mode  :character  
##                     Mean   : 32.097                                        
##                     3rd Qu.: 31.000                                        
##                     Max.   :512.329                                        
##  as.factor.sex.     as.factor.survived. as.factor.embarked.
##  Length:889         Min.   :0.0000      Length:889         
##  Class :character   1st Qu.:0.0000      Class :character   
##  Mode  :character   Median :0.0000      Mode  :character   
##                     Mean   :0.3825                         
##                     3rd Qu.:1.0000                         
##                     Max.   :1.0000
# Compact view of the data frame's structure: dimensions and column types.
str(data)
## 'data.frame':    889 obs. of  15 variables:
##  $ passengerid        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ survived           : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ pclass             : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ name               : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ sex                : chr  "male" "female" "female" "female" ...
##  $ age                : num  22 38 26 35 35 28 54 2 27 14 ...
##  $ sibsp              : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ parch              : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ ticket             : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ fare               : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ embarked           : chr  "S" "C" "S" "S" ...
##  $ data.sex           : chr  "male" "female" "female" "female" ...
##  $ as.factor.sex.     : chr  "male" "female" "female" "female" ...
##  $ as.factor.survived.: int  0 1 1 1 0 0 0 0 1 1 ...
##  $ as.factor.embarked.: chr  "S" "C" "S" "S" ...

Count of Survived passengers

# Bar chart: number of passengers in each survival group (0 / 1).
ggplot(data = data, mapping = aes(x = as.factor(survived))) +
  geom_bar() +
  labs(
    x = "survived",
    y = "count",
    title = "count of survived passengers"
  )

Scatter plot of age against fare price. Most fares are clustered below 100, with a few outliers above 200 and 500 for passengers aged between 20 and 40 years.

# Scatter plot: fare (y) against age (x).
ggplot(data = data) +
  geom_point(mapping = aes(x = age, y = fare))

Histogram displaying the age distribution and count. Most passengers were ~25-40 years old.

# Histogram: passenger ages in bins of width 5, blue bars with black outlines.
ggplot(data = data, mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "black") +
  labs(
    x = "age",
    y = "count",
    title = "age distribution of passengers"
  )

Boxplot showing the age distribution by survival status. Median age is higher for those who did not survive (left box), indicating that younger passengers had a higher survival rate.

# Boxplot: age distribution for each survival group (0 / 1).
# The title no longer mentions "density": a boxplot shows quartiles and
# outliers, not a density estimate (that wording belongs to the violin plot).
ggplot(data, aes(x = as.factor(survived), y = age)) +
  geom_boxplot() +
  xlab("survived") +
  ylab("age") +
  ggtitle("age distribution by survival status")

Violin plot displaying the age distribution by survival status. The majority were ~25-30 years old, both among those who survived and those who did not.

# Violin plot: age density for each survival group (0 / 1).
ggplot(data = data, mapping = aes(x = as.factor(survived), y = age)) +
  geom_violin() +
  labs(
    x = "survived",
    y = "age",
    title = "age distribution and density by survival status"
  )

bar plot of passenger class type and count

# Bar chart: number of passengers in each passenger class.
# The column is referenced by its bare name inside aes(); `data$pclass` would
# bypass ggplot2's data masking and pull the vector from the global object
# instead of the data supplied to the plot.
ggplot(data, aes(x = as.factor(pclass))) +
  geom_bar(fill = "green") +
  xlab("passenger class") +
  ylab("count") +
  ggtitle("count of passenger class")

Bar plot of embarkation location and count: Cherbourg (C), Queenstown (Q), Southampton (S).

# Bar chart: number of passengers per embarkation point (C, Q, S).
# The title now describes this plot; the previous one was copied from the
# passenger-class chart.
ggplot(data, aes(x = embarked)) +
  geom_bar(fill = "lightblue") +
  xlab("embarking point") +
  ylab("count") +
  ggtitle("count of passengers by embarking point")

Scatter plot showing that those who paid the highest fares had a higher chance of survival.

# Scatter plot of age vs fare, one panel per survival group (0 / 1).
# Fixes the "staus" typo in the plot title.
ggplot(data, aes(x = age, y = fare)) +
  geom_point() +
  facet_grid(. ~ survived) +
  xlab("age") +
  ylab("fare") +
  ggtitle("age vs fare by survival status")

Scatter plot of age vs fare by passenger class. Most passengers had a class 3 ticket (the median `pclass` in the summary above is 3).

# Scatter plot of age vs fare in red, one panel per passenger class.
ggplot(data = data, mapping = aes(x = age, y = fare)) +
  geom_point(colour = "red") +
  facet_grid(cols = vars(pclass)) +
  labs(
    x = "age",
    y = "fare",
    title = "age vs fare by passenger class"
  )

  # Single-panel scatter of age vs fare; point colour encodes passenger class
  # (1 = red, 2 = orange, 3 = green).
  ggplot(data = data) +
    geom_point(
      mapping = aes(x = age, y = fare, colour = as.factor(pclass)),
      size = 2
    ) +
    scale_colour_manual(
      values = c("1" = "red", "2" = "orange", "3" = "green")
    ) +
    labs(
      x = "Age",
      y = "Fare",
      title = "Age vs. Fare by Passenger Class",
      colour = "Passenger Class"
    )

installation of ‘plotly’ package for interactive stacked bar plots

install.packages("plotly")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Install dplyr and plotly only when they are missing, then attach both.
invisible(lapply(c("dplyr", "plotly"), function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))
library(dplyr)
library(plotly)

Data preparation for the interactive bar plot by creating new features

# Count passengers per (pclass, survived) pair and express each count as a
# percentage of its own passenger class, so the stacked bars show survival
# proportions within each class and every class sums to 100%.
# The summary is stored under its own name so the passenger-level `data`
# frame is not overwritten.
class_survival <- data %>%
  group_by(pclass, survived) %>%
  # "drop_last" keeps the grouping by pclass for the percentage step below.
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup() %>%
  as.data.frame()

# plotly colours the bars by this column; a factor gives discrete colours.
class_survival$survived <- as.factor(class_survival$survived)

# Interactive stacked bar plot: one bar per passenger class, split by survival.
plot <- plot_ly(
  class_survival,
  x = ~pclass,
  y = ~percentage,
  type = "bar",
  color = ~survived,
  text = ~paste('Survived:', survived, '<br>Percentage:', round(percentage, 2), '%'),
  hoverinfo = "text",
  textposition = "auto"
) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Passenger Class"),
    yaxis = list(title = "Percentage"),
    title = "Survival Proportions by Passenger Class",
    legend = list(title = list(text = "Survived"))
  )

plot

scatter plot of age vs fare colored by embarkation with linear regression (lm).

# Reload the passenger-level data from the CSV.
data <- read.csv(file = "titanic_vis_clean.csv")

# Age vs fare with points coloured by embarkation port, plus linear-model
# smoothers.
# NOTE(review): the constant "blue" overrides the per-port colour on the
# fitted lines, so they cannot be told apart by colour; confirm this is intended.
ggplot(data = data, mapping = aes(x = age, y = fare, colour = embarked)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(
    x = "age",
    y = "fare",
    title = "age and fare by embarkation"
  )
## `geom_smooth()` using formula = 'y ~ x'